/*
nfhs_raw
paul stainier
started 4/28/2021
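use nfhs calendar data to make
district-year measures of conceptions and miscarriages
the nfhs_death_by_age section below uses the nfhs household recode data
to make district-year death rates by age group for rural households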
*/

***************************
*toggle yes/no to decide
*which programs to run
***************************

*"yes" runs the nfhs_death_by_age section below; any other value skips it
local nfhs_death_by_age = "yes"


***************************
*replace with own directories
***************************
*NOTE(review): the right-hand sides below are empty and look like placeholders;
*fill in all three directories before running
*input_path: parent folder of 2015/IAHR74DT and 2020/IAHR7EDT
global input_path = 
*output_path: folder where the final death rate dataset is saved
global output_path = 
*crosswalk_path: folder that holds nfhs4_nfhs5_district_crosswalk
global crosswalk_path = 


***********************
*nfhs_death
*calculate death rate 
*using the household 
*recode data 
*death rate = # deaths per year
*divided by # of household 
*members at time of survey 
***********************
if "`nfhs_death_by_age'" == "yes"{
	
	*************
	*2012-2015 using the 2015-16 survey
	*************
	*build a district_n4 -> statename4 lookup from the district crosswalk 
	*so it can be merged m:1 onto the household data by district_n4 
	*the path is quoted so that directories containing spaces work
	cd "$crosswalk_path"
	use nfhs4_nfhs5_district_crosswalk, clear 
	keep district_n4 statename4 
	*collapse repeated (district_n4, statename4) pairs to a single row
	duplicates drop
	tempfile crosswalk_just4
	save `crosswalk_just4', replace
	
	
	*load only the needed variables from the 2015-16 household recode 
	*the path is quoted so that directories containing spaces work
	cd "$input_path/2015/IAHR74DT"
	use  hv006 hv007 shdistri hv009 hv025 sh75y_* hc31_* sh74u_* sh74n_* hv105_* using IAHR74FL, clear 
	
	*year of interview
	tab hv007
	
	*rename variables
	rename (hv009 shdistri hv025 hv007) (house_size district_n4 urban_rural survey_year)
	
	*keep if rural. urban_rural == 2 is rural
	keep if urban_rural == 2
	
	*merge with crosswalk to get consistent state name 
	*keep(master match): crosswalk districts without any rural household 
	*in this file are not added as empty rows, which would otherwise be 
	*counted as a household once num_houses is generated 
	*nogenerate: _merge is not used afterwards
	cd "$crosswalk_path"
	merge m:1 district_n4 using `crosswalk_just4', keepusing(statename4) keep(master match) nogenerate
	
	*************
	*deaths and births by year
	*sh75y_X is the year of the X-th death 
	*listed for the household: one death fills 
	*only sh75y_1, two deaths fill sh75y_1 and 
	*sh75y_2, and so on 
	*hc31_X works the same way for births
	*************
	
	*************
	*express age at death (sh74n_X) in years, 
	*based on the reporting unit in sh74u_X
	*************
	forvalues slot = 1/5 {
		*unit 2 = months
		replace sh74n_`slot' = sh74n_`slot'/12 if sh74u_`slot' == 2
		*unit 1 = days
		replace sh74n_`slot' = sh74n_`slot'/365 if sh74u_`slot' == 1
		*unit 8 = don't know: age at death becomes missing
		replace sh74n_`slot' = . if sh74u_`slot' == 8
	}
	
	*******************
	*get deaths for each age range
	*note: Stata treats a missing value as larger than any number, 
	*so "age >= 40" is true when the age is missing. age at death is 
	*set to missing when its unit is "don't know", so every age test 
	*below also requires a non-missing age. deaths of unknown age are 
	*left out of all age groups instead of being counted as 40+ and 60+
	********************
	
	foreach year of numlist 2012(1)2016{
		*number of deaths for each age range
		foreach num of numlist 1(1)5{
			*this death slot is dated to the current year and has a known age
			local died_known_age sh75y_`num' == `year' & !missing(sh74n_`num')
			*deaths below 18
			gen death`year'_lt18_`num' = `died_known_age' & sh74n_`num' < 18
			*deaths 18-39
			gen death`year'_18_39_`num' = `died_known_age' & sh74n_`num' >= 18 & sh74n_`num' < 40
			*deaths 40+
			gen death`year'_gte40_`num' = `died_known_age' & sh74n_`num' >= 40
			*deaths 60+ (a subset of the 40+ group)
			gen death`year'_gte60_`num' = `died_known_age' & sh74n_`num' >= 60
		}
		
		foreach num of numlist 1(1)9{
			*births
			gen birth`year'_`num' = hc31_`num' == `year'
		}
	
		*household totals for the year, summed over the slots
		egen death_total_lt18`year' = rowtotal(death`year'_lt18_*)
		egen death_total_18_39`year' = rowtotal(death`year'_18_39_*)
		egen death_total_gte40`year' = rowtotal(death`year'_gte40_*)
		egen death_total_gte60`year' = rowtotal(death`year'_gte60_*)
		egen birth_total`year' = rowtotal(birth`year'_*)
	}
	
	*get the number of people at the time of the survey in each age range 
	*we will adjust for deaths based on age of death 
	*and for births for the below 18 group 
	*hv105_XX is the age of household member XX; it is missing when the 
	*household has fewer members, and every test below is false for missing
	foreach num of numlist 1/41 {
		local formatted_num : display %02.0f `num'
		gen hh_member_lt18_`formatted_num' = hv105_`formatted_num' < 18 
		*upper bound is < 40 so that members aged 39 are counted, 
		*matching the 18-39 death counts
		gen hh_member_18_39_`formatted_num' = hv105_`formatted_num' >= 18 & hv105_`formatted_num' < 40 
		gen hh_member_gte40_`formatted_num' = hv105_`formatted_num' >= 40 & hv105_`formatted_num' < 98 // 95 is top code, and 98 is don't know
		gen hh_member_gte60_`formatted_num' = hv105_`formatted_num' >=60 & hv105_`formatted_num' < 98 // 95 is top code, and 98 is don't know
	}

	*household head counts per age group
	egen hh_total_lt18 = rowtotal(hh_member_lt18_*)
	egen hh_total_18_39 = rowtotal(hh_member_18_39_*)
	egen hh_total_gte40 = rowtotal(hh_member_gte40_*)
	egen hh_total_gte60 = rowtotal(hh_member_gte60_*)
	
	*each row is one household, so summing this flag gives the 
	*number of households surveyed in the district 
	gen num_houses = 1

	
	*add everything up to the district-state level 
	*survey_year becomes the mean interview year of the district
	collapse (sum) death_total* birth_total* house_size hh_total* num_houses ///
		(mean) survey_year, by(district_n4 statename4)
	
	*****
	*death rates for each district-year, 2012-2015, by age group 
	*denominator = members of the age group at the time of the survey 
	*  + deaths in the age group from that year through 2016 
	*  - (lt18 only) births from the following year through 2016 
	*the 18-39, gte40 and gte60 groups are not adjusted for births 
	*****
	foreach grp in lt18 18_39 gte40 gte60 {
		forvalues yr = 2012/2015 {
			*start from the head count at the survey and add back the deaths
			local denom hh_total_`grp'
			forvalues later = `yr'/2016 {
				local denom `denom' + death_total_`grp'`later'
			}
			*only the under-18 group takes out children born after the year
			if "`grp'" == "lt18" {
				local first_birth = `yr' + 1
				forvalues later = `first_birth'/2016 {
					local denom `denom' - birth_total`later'
				}
			}
			gen death_rate_`grp'`yr' = death_total_`grp'`yr'/(`denom')
		}
	}
	
	
	*go from one row per district to one row per district and year 
	keep district_n4 statename4 death_rate* num_houses house_size survey_year
	reshape long death_rate_lt18 death_rate_18_39 death_rate_gte40 death_rate_gte60, i(statename4 district_n4 num_houses house_size survey_year) j(year)
	
	*express every rate as deaths per 1000 people, then remove the raw rates 
	local raw_rates
	foreach grp in lt18 18_39 gte40 gte60 {
		gen death_rate_`grp'_p1000 = death_rate_`grp' * 1000 
		local raw_rates `raw_rates' death_rate_`grp'
	}
	drop `raw_rates'
	
	*deaths seem to have been asked about only for the 3 years before 
	*the interview: 2016 interviews cover 2013-2015 and 2015 interviews 
	*cover 2012-2014. households interviewed in 2015 also did not live 
	*through the whole of 2015. 
	*survey_year is the district mean of the interview year, so: 
	*  2012 is dropped when the mean interview year is above 2015 
	*  2015 is dropped when the mean interview year is below 2016 
	drop if (year == 2012 & survey_year > 2015) | (year == 2015 & survey_year < 2016)
			
	*store these district-year rows so that later rows can be appended to them 
	tempfile dr_by_age_distyear_all 
	save `dr_by_age_distyear_all', replace 
	
	
	*************
	*2016-2019 using the 2019-22 survey
	*************
	*load only the needed variables from the household recode 
	*the path is quoted so that directories containing spaces work
	cd "$input_path/2020/IAHR7EDT"
	use hv007 hv006 shdist hv024 hv009 hv025 sh93y_* hc31_* hv105_* sh92u_* sh92n_* using IAHR7EFL, clear 
	
	*year of interview
	tab hv007
	
	*rename variables 
	rename (hv009 shdist hv024 hv025 hv007) (house_size district5 state urban_rural survey_year)
	
	*keep if rural. urban_rural == 2 is rural
	keep if urban_rural == 2
	
	*merge with crosswalk, get districts to NFHS4 level and consistent statename 
	*keep(master match): crosswalk districts without any rural household in 
	*this file are not added as empty rows. such a row would carry a 
	*district_n4 and be counted as one extra household in num_houses 
	*nogenerate: _merge is not used afterwards
	cd "$crosswalk_path"
	merge m:1 district5 using nfhs4_nfhs5_district_crosswalk,  keepusing(district_n4 statename4) keep(master match) nogenerate
	
	*************
	*deaths and births by year
	*sh93y_X is the year of the X-th death 
	*listed for the household 
	*hc31_X is used for births
	*************
	*************
	*express age at death (sh92n_X) in years, 
	*based on the reporting unit in sh92u_X
	*************
	forvalues slot = 1/5 {
		*unit 2 = months
		replace sh92n_`slot' = sh92n_`slot'/12 if sh92u_`slot' == 2
		*unit 1 = days
		replace sh92n_`slot' = sh92n_`slot'/365 if sh92u_`slot' == 1
		*unit 8 = don't know: age at death becomes missing
		replace sh92n_`slot' = . if sh92u_`slot' == 8
	}
	
	*******************
	*get deaths for each age range
	*note: Stata treats a missing value as larger than any number, 
	*so "age >= 40" is true when the age is missing. age at death is 
	*set to missing when its unit is "don't know", so every age test 
	*below also requires a non-missing age. deaths of unknown age are 
	*left out of all age groups instead of being counted as 40+ and 60+
	********************
	foreach year of numlist 2016(1)2021{
		*number of deaths for each age range
		foreach num of numlist 1(1)5{
			*this death slot is dated to the current year and has a known age
			local died_known_age sh93y_`num' == `year' & !missing(sh92n_`num')
			*deaths below 18
			gen death`year'_lt18_`num' = `died_known_age' & sh92n_`num' < 18
			*deaths 18-39 (upper bound is < 40 so that deaths at age 39 are counted)
			gen death`year'_18_39_`num' = `died_known_age' & sh92n_`num' >= 18 & sh92n_`num' < 40
			*deaths gte40
			gen death`year'_gte40_`num' = `died_known_age' & sh92n_`num' >= 40 
			*deaths 60+ (a subset of the 40+ group)
			gen death`year'_gte60_`num' = `died_known_age' & sh92n_`num' >= 60 
		}
		
		foreach num of numlist 1(1)9{
			*births
			gen birth`year'_`num' = hc31_`num' == `year'
		}
	
		*household totals for the year, summed over the slots
		egen death_total_lt18`year' = rowtotal(death`year'_lt18_*)
		egen death_total_18_39`year' = rowtotal(death`year'_18_39_*)
		egen death_total_gte40`year' = rowtotal(death`year'_gte40_*)
		egen death_total_gte60`year' = rowtotal(death`year'_gte60_*)
		egen birth_total`year' = rowtotal(birth`year'_*)
	}
	
	*get the number of people at the time of the survey in each age range 
	*we will adjust for deaths based on age of death 
	*and for births for the below 18 group 
	*hv105_XX is the age of household member XX; it is missing when the 
	*household has fewer members, and every test below is false for missing
	foreach num of numlist 1/35 {
		local formatted_num : display %02.0f `num'
		gen hh_member_lt18_`formatted_num' = hv105_`formatted_num' < 18 
		*upper bound is < 40 so that members aged 39 are counted in the 18-39 group
		gen hh_member_18_39_`formatted_num' = hv105_`formatted_num' >= 18 & hv105_`formatted_num' < 40
		gen hh_member_gte40_`formatted_num' = hv105_`formatted_num' >= 40 & hv105_`formatted_num' < 98 // 95 is top code, and 98 is don't know
		gen hh_member_gte60_`formatted_num' = hv105_`formatted_num' >=60 & hv105_`formatted_num' < 98 // 95 is top code, and 98 is don't know
	}

	*household head counts per age group
	egen hh_total_lt18 = rowtotal(hh_member_lt18_*)
	egen hh_total_18_39 = rowtotal(hh_member_18_39_*)
	egen hh_total_gte40 = rowtotal(hh_member_gte40_*)
	egen hh_total_gte60 = rowtotal(hh_member_gte60_*)
	
	
	*each row is one household, so summing this flag gives the 
	*number of households surveyed in the district 
	gen num_houses = 1
	
	*add everything up to the district-state level 
	*survey_year becomes the mean interview year of the district
	collapse (sum) death_total* birth_total* house_size hh_total* num_houses ///
		(mean) survey_year, by(district_n4 statename4)
	
	*****
	*death rates for each district-year, 2016-2020, by age group 
	*denominator = members of the age group at the time of the survey 
	*  + deaths in the age group from that year through 2021 
	*  - (lt18 only) births from the following year through 2021 
	*the 18-39, gte40 and gte60 groups are not adjusted for births 
	*****
	foreach grp in lt18 18_39 gte40 gte60 {
		forvalues yr = 2016/2020 {
			*start from the head count at the survey and add back the deaths
			local denom hh_total_`grp'
			forvalues later = `yr'/2021 {
				local denom `denom' + death_total_`grp'`later'
			}
			*only the under-18 group takes out children born after the year
			if "`grp'" == "lt18" {
				local first_birth = `yr' + 1
				forvalues later = `first_birth'/2021 {
					local denom `denom' - birth_total`later'
				}
			}
			gen death_rate_`grp'`yr' = death_total_`grp'`yr' / (`denom')
		}
	}

	*go from one row per district to one row per district and year 
	keep district_n4 statename4 death_rate* num_houses house_size survey_year
	reshape long death_rate_lt18 death_rate_18_39 death_rate_gte40 death_rate_gte60, i(statename4 district_n4 num_houses house_size survey_year) j(year)
	
	*express every rate as deaths per 1000 people, then remove the raw rates 
	local raw_rates
	foreach grp in lt18 18_39 gte40 gte60 {
		gen death_rate_`grp'_p1000 = death_rate_`grp' * 1000 
		local raw_rates `raw_rates' death_rate_`grp'
	}
	drop `raw_rates'
	
	*trim district-years that the interviews do not fully cover 
	*survey_year is the district mean of the interview year, so: 
	*  2016 is dropped when the mean interview year is above 2019 
	*  2019 is dropped when the mean interview year is below 2020 
	*  (households interviewed in 2019 did not live through all of 2019) 
	*  2020 is dropped when the mean interview year is below 2021 
	drop if (year == 2016 & survey_year > 2019) | (year == 2019 & survey_year < 2020) | (year == 2020 & survey_year < 2021)
	
	
	*stack these rows on the rows saved earlier and store the combined data 
	append using `dr_by_age_distyear_all'
	save `dr_by_age_distyear_all', replace 
	
	**********************
	*save final product 
	**********************
	sort statename4 district_n4 year 
	
	*drop delhi and rows without a state name
	drop if statename4 == "Delhi" | missing(statename4)
	
	*drop if ever missing the death rate (urban areas) 
	*a rate is missing when its denominator is zero
	drop if missing(death_rate_lt18_p1000) | missing(death_rate_18_39_p1000) | missing(death_rate_gte40_p1000) | missing(death_rate_gte60_p1000)
	
	compress
	*the path is quoted so that directories containing spaces work
	cd "$output_path"
	save nfhs_death_rate_by_age_rural_district_n4_year_2012_2020, replace 
}


